 # Simulation of Performance
## Show that ADP outperforms single digit tests
## While varying sample size and manipulation rate 

## Helper function:
## Define an individual digit preference as a benford distribution where some digits
## are replaced with a preferred digit

digitpreference <- function(n, p){
    # For sample size n, start with benford numbers
    mantissa  = exp(runif(n)*log(10))
    power = sample(seq(3,7),n, replace = T)
    benford = floor(mantissa *(10^power))

    # Pick 2 digits to prefer, for the whole dataset for this creator
    x <- sample(0:9, 2)

    # With p probability for each obs, replace a random digit in the number
    # for a preferred digit
    for(i in 1:n){
      if(runif(1) < p){
        # Turn number into string
        num = toString(benford[i])
        # Find position to replace
        pos = sample(1:nchar(num), 1)
        
        replace = sample(x, 1)
        
        substr(num, pos, pos) = toString(replace)
        benford[i] = num
      } 
    }
    return(benford)
}

## Helper function: benford data without digit preferences 
nodigitpreference <- function(n){
    # For sample size n, create benford numbers
    mantissa  = exp(runif(n)*log(10))
    power = sample(seq(3,7),n, replace = T)
    benford = floor(mantissa *(10^power))

    return(benford)
}


## Vary sample size per district N
## Vary rate of manipulation among bad districts p 

Nlist <- c(100, 500, 1000, 5000, 10000)
plist <- c(0.1, 0.2, 0.3, 0.4, 0.5)

SimResultsSingle <- data.frame(matrix(ncol = 8, nrow = 0))
colnames(SimResultsSingle) <- c("N", "p", "Afail", "Bfail", "Cfail", "Dfail", "Efail", "Ffail")

SimResultsAll <- data.frame(matrix(ncol = 0, nrow = 8))
colnames(SimResultsSingle) <- c("N", "p", "Afail", "Bfail", "Cfail", "Dfail", "Efail", "Ffail")

## Define manipulator and non-manipulator districts
districts_bad <- c("A", "B", "C")
districts_good <- c("D", "E", "F")

## Loop over all combinations of N and p 
for(N in Nlist){
  for(p in plist){

    print("N, p")
    print(N)
    print(p)

   ## Each district generates its own data
  # A,B,C have digit preferences
  # D,E,F do not 
    
    D = data.frame(matrix(ncol = 0, nrow = 0))

    for(d in districts_bad){
      tmp <- data.frame(matrix(ncol = 0, nrow = N))
      tmp$Expense <- digitpreference(N, p) 
      tmp$District <- d 
      D<- rbind(D, tmp)
    }

    for(e in districts_good){
      tmp <- data.frame(matrix(ncol = 0, nrow = N))
      tmp$Expense <- nodigitpreference(N) 
      tmp$District <- e 
      D<- rbind(D, tmp)
    }

    ## Prep data for Benford tests using our package
    
    library(digitanalysis)
    library(ggplot2)

    Data <- process_digit_data(raw_df = D, digit_columns = c('Expense'))

    #######################################
    # No plots 
    plot_toggle = FALSE 


    ### Single digit tests 

    # First digit test
    first_digit = all_digits_test(digitdata = Data, data_columns = 'Expense', digit_places = 1, omit_05 = 0, break_out='District',
                                  suppress_first_division_plots=TRUE, plot=plot_toggle)
    # Second digit test
    second_digit = all_digits_test(digitdata = Data, data_columns = 'Expense', digit_places = 2, omit_05 = 0, break_out='District',
                                  suppress_first_division_plots=TRUE, plot=plot_toggle)

    # Third digit test
    third_digit = all_digits_test(digitdata = Data, data_columns = 'Expense', digit_places = 3, omit_05 = 0, break_out='District',
                                  suppress_first_division_plots=TRUE, plot=plot_toggle)

    #Last digits test except first with expenditure
    last_digits = all_digits_test(digitdata = Data, data_columns = 'Expense', digit_places = -1, omit_05 = 0, break_out='District',
                                suppress_first_division_plots=TRUE, plot=plot_toggle)

    #All digits test with expenditure
    all_digits = all_digits_test(digitdata = Data, data_columns = 'Expense', skip_first_digit = FALSE, omit_05 = 0, break_out='District',
                                suppress_first_division_plots=TRUE, plot=plot_toggle)



    # Store results from single test
    singleresults<- rbind(as.numeric(first_digit$p_values$All), as.numeric(second_digit$p_values$All), as.numeric(third_digit$p_values$All), as.numeric(last_digits$p_values$All))
    failure <- colSums(singleresults < (0.05/4)*1/25)[2:7]

    SimResultsSingle <- rbind(SimResultsSingle, c(N, p, failure))  
    colnames(SimResultsSingle) <- c("N", "p", "Afail", "Bfail", "Cfail", "Dfail", "Efail", "Ffail")


    # Store All Digits Results
    all_results <- as.numeric(all_digits$p_values$All[2:7])
    failure_all <- as.numeric(all_results < 0.05*1/25)
    SimResultsAll <- rbind(SimResultsAll, c(N, p, failure_all))
    colnames(SimResultsAll) <- c("N", "p", "Afail", "Bfail", "Cfail", "Dfail", "Efail", "Ffail")
  }
}

save(SimResultsSingle, SimResultsAll, file = "SimPerformanceVaried.RData")

SimResultsSingle$TruePositiveRate = (SimResultsSingle$Afail + SimResultsSingle$Bfail + SimResultsSingle$Cfail)/12
SimResultsAll$TruePositiveRate = (SimResultsAll$Afail + SimResultsAll$Bfail + SimResultsAll$Cfail)/3

SimResultsSingle$FalsePositiveRate = (SimResultsSingle$Dfail + SimResultsSingle$Efail + SimResultsSingle$Ffail)/12
SimResultsAll$FalsePositiveRate = (SimResultsAll$Dfail + SimResultsAll$Efail + SimResultsAll$Ffail)/3

SimResultsSingle$Np = SimResultsSingle$N* SimResultsSingle$p 
SimResultsAll$Np = SimResultsAll$N* SimResultsAll$p 

save(SimResultsSingle, SimResultsAll, file = "SimPerformanceVaried.RData")

## Analyze 
pdf("SingleDigitConvergence.pdf")
ggplot(SimResultsSingle, aes(Np)) + 
  geom_point(aes(y = TruePositiveRate, color = "True Positive Rate")) + 
  geom_point(aes(y = FalsePositiveRate, color = "False Positive Rate")) +
  labs(color = "") + xlab("Expected Number of Manipulated Data Points N x p") +
  ylab("Share of Tests") + ggtitle("Single Digit Tests") +
  scale_y_continuous(labels = scales::percent)
dev.off()

pdf("AllDigitConvergence.pdf")
ggplot(SimResultsAll, aes(Np)) + 
  geom_point(aes(y = TruePositiveRate, color = "True Positive Rate")) + 
  geom_point(aes(y = FalsePositiveRate, color = "False Positive Rate")) +
  labs(color = "") + xlab("Expected Number of Manipulated Data Points N x p") +
  ylab("Share of Tests") + ggtitle("All Digit Tests")
dev.off()

library(xlsx)
write.csv(SimResultsSingle, "SimResultsSingle.csv")
write.csv(SimResultsAll, "SimResultsAll.csv")


